It looks like Raustats might not be what I am looking for to get SLA level census data quickly. Let"s try Census2016 from Hugh Parsonage.
full_2016_census <- Census2016_wide_by_SA2_year %>%
filter(year == "2016")
head(full_2016_census)
Yes - this is what I need.
Loading other tables
ancestories_2016 <- Census2016_ancestories %>%
filter(year == "2016")
countries_of_birth_2016 <- Census2016_countries_of_birth %>%
filter(year == "2016")
languages_2016 <- Census2016_languages %>%
filter(year == "2016")
religions_2016 <- Census2016_religions %>%
filter(year == "2016")
Each of the four variables ancestory, country of birth, languages and religions are quite granular, and it may make sense to look at these variables at a lower level of granularity.
Country of birth
if (!file.exists("./Data/country_classification.xls")) {
download.file("https://www.abs.gov.au/ausstats/subscriber.nsf/log?openagent&sacc_12690do0001_201903.xls&1269.0&Data%20Cubes&480BD730AF42D515CA2583BD007707C5&0&2016&15.03.2019&Latest", "./Data/country_classification.xls", method = "libcurl")
}
country_classification_4dig <-
readxl::read_xls("./Data/country_classification.xls",
sheet = "Table 1.3", skip = 7,
col_names = c("X1", "X2", "Country_Code_4", "Country")) %>%
filter(!is.na(Country)) %>%
dplyr::select(-X1, -X2)
country_classification_2dig <-
readxl::read_xls("./Data/country_classification.xls",
sheet = "Table 1.2", skip = 6,
col_names = c("X1", "Country_Code_2", "Country_Name_2")) %>%
filter(!is.na(Country_Name_2)) %>%
dplyr::select(-X1)
country_classification_1dig <-
readxl::read_xls("./Data/country_classification.xls",
sheet = "Table 1.1", skip = 5,
col_names = c("Country_Code_1", "Country_Group")) %>%
filter(!is.na(Country_Group))
Language
if (!file.exists("./Data/language_classification.xls")) {
download.file("https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&ASCL_12670DO0001_201703.xls&1267.0&Data%20Cubes&F84620CF6E13F7E8CA257FF1001E68A7&0&2016&28.03.2017&Latest", "./Data/language_classification.xls", method = "libcurl")
}
language_classification_4dig <-
readxl::read_xls("./Data/language_classification.xls",
sheet = "Table 1.3", skip = 8,
col_names =
c("X1", "X2", "X3", "X4",
"Language_Code_3", "Language")) %>%
filter(!is.na(Language)) %>%
dplyr::select(-X1, -X2, -X3, -X4)
language_classification_1dig <-
readxl::read_xls("./Data/language_classification.xls",
sheet = "Table 1.1", skip = 5,
col_names = c("Language_Code_1", "Language_Group")) %>%
filter(!is.na(Language_Group))
Religion
if (!file.exists("./Data/religion_classification.xls")) {
download.file("https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&ASCRG_12660DO0001_201707.xls&1266.0&Data%20Cubes&B3EAFE3FE6180D37CA257FF1001E673C&0&2016&14.07.2017&Latest", "./Data/religion_classification.xls", method = "libcurl")
}
religion_classification_3dig <-
readxl::read_xls("./Data/religion_classification.xls",
sheet = "Table 1.2", skip = 6,
col_names = c("X1", "Religion_Code_3", "Religion")) %>%
filter(!is.na(Religion)) %>%
dplyr::select(-X1)
religion_classification_1dig <-
readxl::read_xls("./Data/religion_classification.xls",
sheet = "Table 1.1", skip = 5,
col_names = c("Religion_Code_1", "Religion_Group")) %>%
filter(!is.na(Religion_Group))
Aggregate at the SA2 level - add unless variable contains median, average, persons_per_bedroom
census_2016_all_vars <- Census2016_wide_by_SA2_year %>%
filter(year == "2016") %>%
rowwise() %>%
mutate(sa2_id = paste0(substr(sa2_code, 1, 1), substr(sa2_code, 6, 9))) %>%
filter(isMissing == FALSE) %>%
mutate(percent_female = female / persons,
percent_defacto = defacto_persons / persons,
percent_married = married_persons / persons,
percent_indig = indig_persons / persons,
percent_born_in_australia = born_in_australia / persons,
percent_unit = flat_or_unit / n_dwellings,
percent_mortgage = dwelling_owned_mortgage / n_dwellings,
percent_rent = dwelling_rented / n_dwellings)
census_2016_means <- census_2016_all_vars %>%
dplyr::select(median_age, median_household_income, average_household_size,
persons_per_bedroom, median_weekly_rent,
median_annual_mortgage, sa2_id) %>%
group_by(sa2_id) %>%
summarise_all(mean, na.rm = TRUE)
census_2016_counts <- census_2016_all_vars %>%
dplyr::select(n_dwellings, persons, female, male,
married_persons, married_females, married_males,
defacto_persons, defacto_females, defacto_males,
notmarried_persons, notmarried_females, notmarried_males,
indig_persons, indig_males, indig_females, non_indig_persons,
non_indig_females, non_indig_males, not_stated_indig_persons,
not_stated_indig_males, not_stated_indig_females,
born_in_australia, born_overseas, country_not_stated,
separate_house, flat_or_unit,
housing_other_or_not_stated, semi_or_townhouse,
dwelling_owned_outright, dwelling_owned_mortgage,
dwelling_other_or_not_stated, dwelling_rented, sa2_id) %>%
group_by(sa2_id) %>%
summarise_all(sum, na.rm = TRUE) %>%
mutate(percent_female = female / persons,
percent_defacto = defacto_persons / persons,
percent_married = married_persons / persons,
percent_indig = indig_persons / persons,
percent_born_in_australia = born_in_australia / persons,
percent_unit = flat_or_unit / n_dwellings,
percent_mortgage = dwelling_owned_mortgage / n_dwellings,
percent_rent = dwelling_rented / n_dwellings)
So what I need is weighted demographic data for each of the polling places based on the number of people from each SLA2 who voted at the polling place. Since we don’t know who voted where, and who can vote at all, we are making the naive assumptions that * Voters at each SLA are similar * Voters are representative of census respondents at the SLA2 level.
Download and load polling places by SA1
if (!file.exists("./Data/polling-place-by-sa1s-2016.xlsx")) {
download.file("https://www.aec.gov.au/Elections/Federal_Elections/2016/files/polling-place-by-sa1s-2016.xlsx",
"./Data/polling-place-by-sa1s-2016.xlsx", method = "libcurl")
}
polling_place_data <-
readxl::read_xlsx("./Data/polling-place-by-sa1s-2016.xlsx")
Aggregate polling place data to SA2
polling_place_sa2 <- polling_place_data %>%
mutate(sa2_id = floor(SA1_id / 100)) %>%
group_by(year, state_ab, div_nm, pp_id, pp_nm, sa2_id) %>%
summarise(votes = sum(votes))
Combine with demographic data and aggregate
polling_place_demog <- polling_place_sa2 %>%
mutate(sa2_id = as.character(sa2_id)) %>%
inner_join(census_2016_all_vars)
polling_place_demog_means <- polling_place_demog %>%
dplyr::select(year, state_ab, div_nm, pp_id, pp_nm, sa2_id, votes,
median_age, median_household_income, average_household_size,
persons_per_bedroom, median_weekly_rent,
median_annual_mortgage, percent_female, percent_defacto,
percent_married, percent_indig, percent_born_in_australia,
percent_unit, percent_mortgage, percent_rent) %>%
group_by(year, state_ab, div_nm, pp_id, pp_nm) %>%
summarise_at(vars(median_age, median_household_income,
average_household_size, persons_per_bedroom,
median_weekly_rent, median_annual_mortgage, percent_female,
percent_defacto, percent_married, percent_indig,
percent_born_in_australia, percent_unit, percent_mortgage,
percent_rent), funs(weighted.mean(., w = votes)))
Add in 2pp at the polling booth level
election_2pp <- twoparty_pollingbooth_download()
polling_place_2pp <- polling_place_demog_means %>%
group_by() %>%
rename(StateAb = state_ab,
DivisionNm = div_nm,
PollingPlace = pp_nm,
PollingPlaceID = pp_id) %>%
mutate(DivisionNm = toupper(DivisionNm),
PollingPlace = toupper(PollingPlace)) %>%
left_join(election_2pp %>%
filter(year == 2016))
Check for missing data
polling_place_2pp %>%
summarise_all(funs(sum(is.na(.))))
Which booths are null?
polling_place_2pp %>%
filter(is.na(TotalVotes)) %>%
tabyl(PollingPlace)
So the Absent, Postals, Pre-Poll and Provisional votes aren’t in this table. Let"s come back to those…
polling_place_2pp %>%
summarise_all(funs(sum(is.null(.))))
Remove the rows with NAs
polling_place_2pp_clean <- polling_place_2pp %>%
filter(!is.na(TotalVotes))
polling_place_2pp_clean %>%
summarise_all(funs(sum(is.na(.))))
Which polling stations have missing data? Not too concerned about post code, as there are some special booths
polling_place_2pp_clean %>%
filter(is.na(median_age) | is.na(Latitude))
Looks like mobile teams and prepoll centres, and only latitude and longitude. Will remove the Brand mobile team, as the demographic data does not look valid.
polling_place_2pp_clean <- polling_place_2pp_clean %>%
dplyr::filter(PollingPlaceID != 65161)
Look at variable summaries
skimr::skim_tee(polling_place_2pp_clean)
## -- Data Summary ------------------------
## Values
## Name data
## Number of rows 8167
## Number of columns 29
## _______________________
## Column type frequency:
## character 3
## numeric 26
## ________________________
## Group variables None
##
## -- Variable type: character ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## # A tibble: 3 x 8
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## * <chr> <int> <dbl> <int> <int> <int> <int> <int>
## 1 StateAb 0 1 2 3 0 8 0
## 2 DivisionNm 0 1 4 15 0 150 0
## 3 PollingPlace 0 1 3 38 0 7308 0
##
## -- Variable type: numeric ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## # A tibble: 26 x 11
## skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
## * <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 year 0 1 2016 0 2016 2016 2016 2016 2016 <U+2581><U+2581><U+2587><U+2581><U+2581>
## 2 PollingPlaceID 0 1 16942. 21513. 1 3112. 6453 31664. 82815 <U+2587><U+2581><U+2581><U+2581><U+2581>
## 3 median_age 0 1 40.3 5.47 22.9 36.3 40.0 43.9 61.8 <U+2581><U+2586><U+2587><U+2582><U+2581>
## 4 median_household_income 0 1 74568. 20909. 39294. 58443. 70395. 87600. 154668. <U+2587><U+2587><U+2585><U+2582><U+2581>
## 5 average_household_size 0 1 2.57 0.304 1.63 2.36 2.51 2.75 5.32 <U+2583><U+2587><U+2581><U+2581><U+2581>
## 6 persons_per_bedroom 0 1 0.832 0.105 0.690 0.796 0.8 0.884 1.98 <U+2587><U+2581><U+2581><U+2581><U+2581>
## 7 median_weekly_rent 0 1 316. 110. 16.2 240 319. 382. 786. <U+2582><U+2587><U+2587><U+2581><U+2581>
## 8 median_annual_mortgage 0 1 20753. 5631. 266. 16468. 20771. 24121. 41242. <U+2581><U+2583><U+2587><U+2583><U+2581>
## 9 percent_female 0 1 0.504 0.0156 0.315 0.497 0.507 0.514 0.541 <U+2581><U+2581><U+2581><U+2582><U+2587>
## 10 percent_defacto 0 1 0.0761 0.0218 0.0202 0.0652 0.0767 0.0863 0.249 <U+2583><U+2587><U+2581><U+2581><U+2581>
## 11 percent_married 0 1 0.346 0.0502 0.0598 0.320 0.352 0.378 0.507 <U+2581><U+2581><U+2583><U+2587><U+2581>
## 12 percent_indig 0 1 0.0339 0.0638 0.000863 0.00816 0.0186 0.0395 0.954 <U+2587><U+2581><U+2581><U+2581><U+2581>
## 13 percent_born_in_australia 0 1 0.707 0.125 0.192 0.622 0.745 0.807 0.987 <U+2581><U+2581><U+2583><U+2587><U+2582>
## 14 percent_unit 52 0.994 0.0869 0.137 0 0.0107 0.0302 0.0935 0.802 <U+2587><U+2581><U+2581><U+2581><U+2581>
## 15 percent_mortgage 52 0.994 0.319 0.0873 0.00238 0.262 0.307 0.368 0.687 <U+2581><U+2585><U+2587><U+2582><U+2581>
## 16 percent_rent 52 0.994 0.268 0.0990 0.0498 0.200 0.256 0.320 0.891 <U+2585><U+2587><U+2582><U+2581><U+2581>
## 17 LNP_Votes 0 1 683. 845. 0 202 498 881 10094 <U+2587><U+2581><U+2581><U+2581><U+2581>
## 18 LNP_Percent 0 1 52.1 15.7 0 41.1 52.8 63.1 100 <U+2581><U+2583><U+2587><U+2585><U+2581>
## 19 ALP_Votes 0 1 685. 799. 0 145 523 956. 9481 <U+2587><U+2581><U+2581><U+2581><U+2581>
## 20 ALP_Percent 0 1 47.8 15.7 0 36.9 47.2 58.8 100 <U+2581><U+2585><U+2587><U+2583><U+2581>
## 21 TotalVotes 0 1 1368. 1549. 0 366. 1133 1840 16396 <U+2587><U+2581><U+2581><U+2581><U+2581>
## 22 Swing 0 1 -2.90 7.61 -65.7 -6.11 -2.88 0.14 85.7 <U+2581><U+2583><U+2587><U+2581><U+2581>
## 23 Latitude 33 0.996 -32.7 6.80 -43.4 -36.7 -33.9 -30.8 57.9 <U+2587><U+2581><U+2581><U+2581><U+2581>
## 24 Longitude 33 0.996 144. 15.2 -95.7 145. 147. 151. 168. <U+2581><U+2581><U+2581><U+2581><U+2587>
## 25 DivisionID 0 1 187. 55.2 101 139 183 225 317 <U+2587><U+2587><U+2587><U+2585><U+2582>
## 26 PremisesPostCode 388 0.952 3784. 1456. 800 2539 3377 4742. 7470 <U+2581><U+2587><U+2585><U+2583><U+2582>
polling_place_2pp_clean %>%
ggplot(aes(x = LNP_Percent / 100)) +
stat_density(geom = "line", colour = "blue") +
theme_classic(base_size = 16) +
scale_x_continuous(labels = scales::percent) +
labs(title = "2016 Election: LNP 2 Party Preferred Percentage",
x = "2PP Percentage", y = "Density",
subtitle = "by Polling Booth, Unweighted")
polling_place_2pp_clean %>%
ggplot(aes(x = ALP_Percent / 100)) +
stat_density(geom = "line", colour = "red") +
theme_classic(base_size = 16) +
scale_x_continuous(labels = scales::percent) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
x = "2PP Percentage", y = "Density",
subtitle = "by Polling Booth, Unweighted")
polling_place_2pp_clean %>%
ggplot(aes(x = Swing / 100)) +
stat_density(geom = "line", colour = "purple") +
theme_classic(base_size = 16) +
scale_x_continuous(labels = scales::percent) +
labs(title = "2016 Election: Swing to Incumbent", x = "Swing",
y = "Density", subtitle = "by Polling Booth, Unweighted")
polling_place_2pp_clean %>%
ggplot(aes(x = median_household_income)) +
stat_density(geom = "line", colour = "purple") +
theme_classic(base_size = 16) +
scale_x_continuous(labels = scales::dollar) +
labs(title = "2016 Census: Median Income", x = "Median Income",
y = "Density", subtitle = "by Polling Booth, Unweighted")
Can we look at these distributions by state?
polling_place_2pp_clean %>%
ggplot(aes(x = ALP_Percent / 100, colour = StateAb)) +
stat_density(geom = "line", position = "dodge") +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
scale_x_continuous(labels = scales::percent) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
x = "2PP Percentage", y = "Frequency",
subtitle = "by Polling Booth, Unweighted")
What about by median income
polling_place_2pp_clean %>%
ggplot(aes(y = ALP_Percent / 100, x = median_household_income,
colour = StateAb)) +
geom_point() +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::dollar) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
x = "Booth Median Income",
y = "ALP 2pp Percentage", subtitle = "by Polling Booth, Unweighted") +
facet_wrap(~StateAb, nrow = 4)
What about comparing NSW electorates? There seems to be an odd separation in income bands for low ALP 2pp. Could this be a regional vs city difference?
fp_booth_16 <- firstpref_pollingbooth_download() %>%
filter(year == 2016)
polling_2cp <- read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseTcpByCandidateByPollingPlaceDownload-20499.csv", skip = 1)
polling_2pp <- read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseTppByPollingPlaceDownload-20499.csv", skip = 1)
fp_booth_2016 <- read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-NSW.csv", skip = 1) %>%
rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-VIC.csv", skip = 1)) %>%
rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-QLD.csv", skip = 1)) %>%
rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-SA.csv", skip = 1)) %>%
rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-WA.csv", skip = 1)) %>%
rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-TAS.csv", skip = 1)) %>%
rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-NT.csv", skip = 1)) %>%
rbind(read_csv("https://results.aec.gov.au/20499/Website/Downloads/HouseStateFirstPrefsByPollingPlaceDownload-20499-ACT.csv", skip = 1))
coalition_contest_2016 <- fp_booth_2016 %>%
group_by(DivisionNm, PartyNm, HistoricElected) %>%
summarise(OrdinaryVotes = sum(OrdinaryVotes)) %>%
filter(PartyNm %in% c("Liberal", "Country Liberals (NT)",
"Liberal National Party of Queensland",
"The Nationals")) %>%
group_by(DivisionNm) %>%
top_n(1) %>%
dplyr::select(DivisionNm, PartyNm)
If we look at a couple of the states where high income booths tend to vote strongly for the coalition as well as lower income booths, we can see that some (but not all) of the lower income booths are contested by The Nationals. This indicares (not surprisingly) that Nationals voters and Liberal voters are different socio-economically, or possibly that city and country coalition voters differ.
polling_place_2pp_clean %>%
mutate(DivisionNm = stringr::str_to_title(DivisionNm)) %>%
inner_join(coalition_contest_2016) %>%
filter(StateAb == "NSW") %>%
ggplot(aes(y = ALP_Percent / 100, x = median_household_income,
colour = PartyNm)) +
geom_point(size = 3) +
theme_classic(base_size = 16) +
scale_color_manual(values = c("blue", "dark green")) +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::dollar) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
x = "Booth Median Income",
y = "ALP 2pp Percentage", colour = "Coalition Party",
subtitle = "by Polling Booth, Unweighted (NSW)")
polling_place_2pp_clean %>%
mutate(DivisionNm = stringr::str_to_title(DivisionNm)) %>%
inner_join(coalition_contest_2016) %>%
filter(StateAb == "VIC") %>%
ggplot(aes(y = ALP_Percent / 100, x = median_household_income,
colour = PartyNm)) +
geom_point(size = 3) +
theme_classic(base_size = 16) +
scale_color_manual(values = c("blue", "dark green")) +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::dollar) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
x = "Booth Median Income",
y = "ALP 2pp Percentage", colour = "Coalition Party",
subtitle = "by Polling Booth, Unweighted (VIC)")
This effect is less clear in states where the Nationals aren’t as prominent, either because the Nationals aren’t as prominent (SA, WA, TAS), or are merged with the Liberals (QLD).
polling_place_2pp_clean %>%
mutate(DivisionNm = stringr::str_to_title(DivisionNm)) %>%
inner_join(coalition_contest_2016) %>%
filter(StateAb == "WA") %>%
ggplot(aes(y = ALP_Percent / 100, x = median_household_income,
colour = PartyNm)) +
geom_point(size = 3) +
theme_classic(base_size = 16) +
scale_color_manual(values = c("blue", "dark green")) +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::dollar) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
x = "Booth Median Income",
y = "ALP 2pp Percentage", colour = "Coalition Party",
subtitle = "by Polling Booth, Unweighted (WA)")
Perhaps we would be better off using the geographical classifications from the AEC.
webpage <-
read_html("http://results.aec.gov.au/20499/Website/HouseDivisionClassifications-20499-NAT.htm")
division_classifications <- webpage %>%
html_nodes("#divisionClassifications") %>%
html_table(fill = TRUE) %>%
.[[1]]
division_classifications <- division_classifications %>%
filter(Division != "Total Enrolment")
polling_place_2pp_clean <- polling_place_2pp_clean %>%
mutate(Division = stringr::str_to_title(DivisionNm)) %>%
inner_join(division_classifications)
The graph below shows that the booths that have a low ALP 2pp and a low median income are primarily rural booths. This relationship seems stronger than the Lib/Nat split.
polling_place_2pp_clean %>%
filter(StateAb == "NSW") %>%
ggplot(aes(y = ALP_Percent / 100, x = median_household_income,
colour = Demographic)) +
geom_point(size = 1) +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::dollar) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
y = "ALP 2pp Percentage",
x = "Booth Median Income", colour = "Region",
subtitle = "by Polling Booth, Unweighted (NSW)")
Looking at all states we see a similar relationship, although less strong than in NSW.
polling_place_2pp_clean %>%
ggplot(aes(y = ALP_Percent / 100, x = median_household_income,
colour = Demographic)) +
geom_point(size = 1) +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::dollar) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
y = "ALP 2pp Percentage",
x = "Median Booth Income", colour = "Region",
subtitle = "by Polling Booth, Unweighted")
What about some of the other variables?
polling_place_2pp_clean %>%
ggplot(aes(y = ALP_Percent / 100, x = average_household_size,
colour = Demographic)) +
geom_point(size = 1) +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
y = "ALP 2pp Percentage",
x = "Average Household Size", colour = "Region",
subtitle = "by Polling Booth, Unweighted")
polling_place_2pp_clean %>%
ggplot(aes(y = ALP_Percent / 100, x = percent_female, colour = Demographic)) +
geom_point(size = 1) +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
theme(legend.position = "bottom") +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
y = "ALP 2pp Percentage",
x = "Percent Female", colour = "Region",
subtitle = "by Polling Booth, Unweighted")
polling_place_2pp_clean %>%
dplyr::select(ALP_Percent, Swing, median_age, median_household_income,
average_household_size, persons_per_bedroom, median_weekly_rent,
median_annual_mortgage, percent_female, percent_defacto,
percent_born_in_australia,
percent_unit, percent_mortgage, percent_rent) %>%
cor %>%
kable()
| ALP_Percent | Swing | median_age | median_household_income | average_household_size | persons_per_bedroom | median_weekly_rent | median_annual_mortgage | percent_female | percent_defacto | percent_born_in_australia | percent_unit | percent_mortgage | percent_rent | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ALP_Percent | 1.0000000 | -0.2912343 | -0.4107650 | -0.0056240 | 0.1653612 | 0.3613836 | 0.1525497 | 0.1068924 | 0.1182998 | 0.1417798 | -0.3283471 | NA | NA | NA |
| Swing | -0.2912343 | 1.0000000 | 0.0514799 | 0.1105950 | -0.0781497 | 0.0236802 | 0.0899012 | 0.1048528 | 0.0807160 | -0.0001612 | -0.0461574 | NA | NA | NA |
| median_age | -0.4107650 | 0.0514799 | 1.0000000 | -0.4776289 | -0.5107579 | -0.5931916 | -0.4041044 | -0.4171832 | -0.0537144 | -0.0620614 | 0.5213940 | NA | NA | NA |
| median_household_income | -0.0056240 | 0.1105950 | -0.4776289 | 1.0000000 | 0.4138939 | 0.3353107 | 0.8079796 | 0.8679774 | 0.2015761 | -0.0669486 | -0.4025008 | NA | NA | NA |
| average_household_size | 0.1653612 | -0.0781497 | -0.5107579 | 0.4138939 | 1.0000000 | 0.3779623 | 0.3332586 | 0.3183454 | -0.0068105 | -0.5133083 | -0.2969940 | NA | NA | NA |
| persons_per_bedroom | 0.3613836 | 0.0236802 | -0.5931916 | 0.3353107 | 0.3779623 | 1.0000000 | 0.4035784 | 0.3926183 | -0.0036236 | 0.0290107 | -0.5889513 | NA | NA | NA |
| median_weekly_rent | 0.1525497 | 0.0899012 | -0.4041044 | 0.8079796 | 0.3332586 | 0.4035784 | 1.0000000 | 0.9280559 | 0.4262035 | -0.1258483 | -0.6024308 | NA | NA | NA |
| median_annual_mortgage | 0.1068924 | 0.1048528 | -0.4171832 | 0.8679774 | 0.3183454 | 0.3926183 | 0.9280559 | 1.0000000 | 0.3597541 | -0.1104388 | -0.5512846 | NA | NA | NA |
| percent_female | 0.1182998 | 0.0807160 | -0.0537144 | 0.2015761 | -0.0068105 | -0.0036236 | 0.4262035 | 0.3597541 | 1.0000000 | -0.0999105 | -0.1133221 | NA | NA | NA |
| percent_defacto | 0.1417798 | -0.0001612 | -0.0620614 | -0.0669486 | -0.5133083 | 0.0290107 | -0.1258483 | -0.1104388 | -0.0999105 | 1.0000000 | 0.2271308 | NA | NA | NA |
| percent_born_in_australia | -0.3283471 | -0.0461574 | 0.5213940 | -0.4025008 | -0.2969940 | -0.5889513 | -0.6024308 | -0.5512846 | -0.1133221 | 0.2271308 | 1.0000000 | NA | NA | NA |
| percent_unit | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1 | NA | NA |
| percent_mortgage | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1 | NA |
| percent_rent | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 1 |
Note that there appear to be lots of NAs for the housing data. Need to fix/impute.
polling_place_2pp_clean %>%
ggplot(aes(y = ALP_Percent / 100, x = persons_per_bedroom,
colour = Demographic)) +
geom_point(size = 1) +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
y = "ALP 2pp Percentage",
x = "Persons per Bedroom", colour = "Region",
subtitle = "by Polling Booth, Unweighted")
polling_place_2pp_clean %>%
ggplot(aes(y = ALP_Percent / 100, x = percent_unit, colour = Demographic)) +
geom_point(size = 1) +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::percent) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
y = "ALP 2pp Percentage",
x = "Percent of Dwellings - Unit", colour = "Region",
subtitle = "by Polling Booth, Unweighted")
polling_place_2pp_clean %>%
ggplot(aes(y = ALP_Percent / 100, x = percent_mortgage,
colour = Demographic)) +
geom_point(size = 1) +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::percent) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
y = "ALP 2pp Percentage",
x = "Percent of Dwellings - Under Mortgage", colour = "Region",
subtitle = "by Polling Booth, Unweighted")
polling_place_2pp_clean %>%
ggplot(aes(y = ALP_Percent / 100, x = percent_rent, colour = Demographic)) +
geom_point(size = 1) +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::percent) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
y = "ALP 2pp Percentage",
x = "Percent of Dwellings - Renting", colour = "Region",
subtitle = "by Polling Booth, Unweighted")
polling_place_2pp_clean %>%
ggplot(aes(y = ALP_Percent / 100, x = percent_indig, colour = Demographic)) +
geom_point(size = 1) +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::percent) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
y = "ALP 2pp Percentage",
x = "Percent Indigeneous", colour = "Region",
subtitle = "by Polling Booth, Unweighted")
polling_place_2pp_clean %>%
ggplot(aes(y = ALP_Percent / 100, x = percent_born_in_australia,
colour = Demographic)) +
geom_point(size = 1) +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::percent) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
y = "ALP 2pp Percentage",
x = "Percent Born in Australia", colour = "Region",
subtitle = "by Polling Booth, Unweighted")
polling_place_2pp_clean %>%
ggplot(aes(y = ALP_Percent / 100, x = percent_defacto,
colour = Demographic)) +
geom_point(size = 1) +
theme_classic(base_size = 16) + scale_color_brewer(palette = "Dark2") +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::percent) +
labs(title = "2016 Election: ALP 2 Party Preferred Percentage",
y = "ALP 2pp Percentage",
x = "Percent in a Defacto Relationship", colour = "Region",
subtitle = "by Polling Booth, Unweighted")
Add extra variables Look at lagged results Build models
Can we build a simple linear model to predict 2pp
alp_2pp_lm_demog <-
lm(ALP_Percent ~ median_household_income * Demographic + percent_indig +
percent_female + percent_defacto + percent_born_in_australia +
percent_rent + median_weekly_rent + median_age,
data = polling_place_2pp_clean)
summary(alp_2pp_lm_demog)
##
## Call:
## lm(formula = ALP_Percent ~ median_household_income * Demographic +
## percent_indig + percent_female + percent_defacto + percent_born_in_australia +
## percent_rent + median_weekly_rent + median_age, data = polling_place_2pp_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.536 -8.043 -0.237 7.809 73.017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109.644469038 5.585384199 19.631 < 0.0000000000000002 ***
## median_household_income -0.000517611 0.000017675 -29.284 < 0.0000000000000002 ***
## DemographicOuter Metropolitan -1.583368327 1.865918048 -0.849 0.39614
## DemographicProvincial -10.724897471 2.380920835 -4.505 0.00000674955 ***
## DemographicRural -36.155148550 1.977005482 -18.288 < 0.0000000000000002 ***
## percent_indig 16.501075768 2.761240592 5.976 0.00000000239 ***
## percent_female 16.762816547 11.349092093 1.477 0.13971
## percent_defacto 151.352869834 7.954650274 19.027 < 0.0000000000000002 ***
## percent_born_in_australia -9.365438012 2.328767479 -4.022 0.00005834398 ***
## percent_rent -12.770771154 2.509964253 -5.088 0.00000037027 ***
## median_weekly_rent 0.032011772 0.003118428 10.265 < 0.0000000000000002 ***
## median_age -0.890392784 0.046220782 -19.264 < 0.0000000000000002 ***
## median_household_income:DemographicOuter Metropolitan -0.000003321 0.000021314 -0.156 0.87619
## median_household_income:DemographicProvincial 0.000083316 0.000030512 2.731 0.00634 **
## median_household_income:DemographicRural 0.000309088 0.000024982 12.372 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.15 on 7775 degrees of freedom
## (52 observations deleted due to missingness)
## Multiple R-squared: 0.3995, Adjusted R-squared: 0.3984
## F-statistic: 369.5 on 14 and 7775 DF, p-value: < 0.00000000000000022
anova(alp_2pp_lm_demog)
plot(alp_2pp_lm_demog)
vif(alp_2pp_lm_demog)
## GVIF Df GVIF^(1/(2*Df))
## median_household_income 7.316624 1 2.704926
## Demographic 18261.455172 3 5.131637
## percent_indig 1.706153 1 1.306198
## percent_female 1.650000 1 1.284523
## percent_defacto 1.614552 1 1.270650
## percent_born_in_australia 4.563213 1 2.136168
## percent_rent 3.299426 1 1.816432
## median_weekly_rent 6.269462 1 2.503889
## median_age 3.407630 1 1.845977
## median_household_income:Demographic 13002.761451 3 4.849228
gv_alp_2pp_lm_demog <- gvlma(alp_2pp_lm_demog)
summary(gv_alp_2pp_lm_demog)
##
## Call:
## lm(formula = ALP_Percent ~ median_household_income * Demographic +
## percent_indig + percent_female + percent_defacto + percent_born_in_australia +
## percent_rent + median_weekly_rent + median_age, data = polling_place_2pp_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.536 -8.043 -0.237 7.809 73.017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109.644469038 5.585384199 19.631 < 0.0000000000000002 ***
## median_household_income -0.000517611 0.000017675 -29.284 < 0.0000000000000002 ***
## DemographicOuter Metropolitan -1.583368327 1.865918048 -0.849 0.39614
## DemographicProvincial -10.724897471 2.380920835 -4.505 0.00000674955 ***
## DemographicRural -36.155148550 1.977005482 -18.288 < 0.0000000000000002 ***
## percent_indig 16.501075768 2.761240592 5.976 0.00000000239 ***
## percent_female 16.762816547 11.349092093 1.477 0.13971
## percent_defacto 151.352869834 7.954650274 19.027 < 0.0000000000000002 ***
## percent_born_in_australia -9.365438012 2.328767479 -4.022 0.00005834398 ***
## percent_rent -12.770771154 2.509964253 -5.088 0.00000037027 ***
## median_weekly_rent 0.032011772 0.003118428 10.265 < 0.0000000000000002 ***
## median_age -0.890392784 0.046220782 -19.264 < 0.0000000000000002 ***
## median_household_income:DemographicOuter Metropolitan -0.000003321 0.000021314 -0.156 0.87619
## median_household_income:DemographicProvincial 0.000083316 0.000030512 2.731 0.00634 **
## median_household_income:DemographicRural 0.000309088 0.000024982 12.372 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.15 on 7775 degrees of freedom
## (52 observations deleted due to missingness)
## Multiple R-squared: 0.3995, Adjusted R-squared: 0.3984
## F-statistic: 369.5 on 14 and 7775 DF, p-value: < 0.00000000000000022
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = alp_2pp_lm_demog)
##
## Value p-value Decision
## Global Stat 329.183 0.0000000000 Assumptions NOT satisfied!
## Skewness 24.285 0.0000008309 Assumptions NOT satisfied!
## Kurtosis 271.406 0.0000000000 Assumptions NOT satisfied!
## Link Function 9.194 0.0024278954 Assumptions NOT satisfied!
## Heteroscedasticity 24.298 0.0000008254 Assumptions NOT satisfied!
alp_2pp_lm_demog <-
lm(ALP_Percent ~ median_household_income + Demographic + percent_indig +
percent_defacto + percent_born_in_australia +
percent_rent + median_weekly_rent + median_age,
data = polling_place_2pp_clean)
summary(alp_2pp_lm_demog)
##
## Call:
## lm(formula = ALP_Percent ~ median_household_income + Demographic +
## percent_indig + percent_defacto + percent_born_in_australia +
## percent_rent + median_weekly_rent + median_age, data = polling_place_2pp_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.420 -8.135 -0.258 8.075 82.895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 123.52168682 2.62448183 47.065 < 0.0000000000000002 ***
## median_household_income -0.00044876 0.00001369 -32.780 < 0.0000000000000002 ***
## DemographicOuter Metropolitan -1.56733045 0.46215084 -3.391 0.000699 ***
## DemographicProvincial -2.69135517 0.62742024 -4.290 0.000018119492 ***
## DemographicRural -14.17052058 0.60808623 -23.303 < 0.0000000000000002 ***
## percent_indig 12.91963465 2.76941622 4.665 0.000003136028 ***
## percent_defacto 152.44664157 7.82591302 19.480 < 0.0000000000000002 ***
## percent_born_in_australia -11.21826628 2.14098299 -5.240 0.000000164993 ***
## percent_rent -15.79729218 2.43979124 -6.475 0.000000000101 ***
## median_weekly_rent 0.02988500 0.00276810 10.796 < 0.0000000000000002 ***
## median_age -1.12325830 0.04174571 -26.907 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.3 on 7779 degrees of freedom
## (52 observations deleted due to missingness)
## Multiple R-squared: 0.3846, Adjusted R-squared: 0.3838
## F-statistic: 486.2 on 10 and 7779 DF, p-value: < 0.00000000000000022
anova(alp_2pp_lm_demog)
plot(alp_2pp_lm_demog)
vif(alp_2pp_lm_demog)
## GVIF Df GVIF^(1/(2*Df))
## median_household_income 4.285242 1 2.070083
## Demographic 4.657881 3 1.292304
## percent_indig 1.675652 1 1.294470
## percent_defacto 1.525730 1 1.235204
## percent_born_in_australia 3.765674 1 1.940534
## percent_rent 3.043732 1 1.744629
## median_weekly_rent 4.823036 1 2.196141
## median_age 2.713934 1 1.647402
crPlots(alp_2pp_lm_demog)
ceresPlots(alp_2pp_lm_demog)
gv_alp_2pp_lm_demog <- gvlma(alp_2pp_lm_demog)
summary(gv_alp_2pp_lm_demog)
##
## Call:
## lm(formula = ALP_Percent ~ median_household_income + Demographic +
## percent_indig + percent_defacto + percent_born_in_australia +
## percent_rent + median_weekly_rent + median_age, data = polling_place_2pp_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.420 -8.135 -0.258 8.075 82.895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 123.52168682 2.62448183 47.065 < 0.0000000000000002 ***
## median_household_income -0.00044876 0.00001369 -32.780 < 0.0000000000000002 ***
## DemographicOuter Metropolitan -1.56733045 0.46215084 -3.391 0.000699 ***
## DemographicProvincial -2.69135517 0.62742024 -4.290 0.000018119492 ***
## DemographicRural -14.17052058 0.60808623 -23.303 < 0.0000000000000002 ***
## percent_indig 12.91963465 2.76941622 4.665 0.000003136028 ***
## percent_defacto 152.44664157 7.82591302 19.480 < 0.0000000000000002 ***
## percent_born_in_australia -11.21826628 2.14098299 -5.240 0.000000164993 ***
## percent_rent -15.79729218 2.43979124 -6.475 0.000000000101 ***
## median_weekly_rent 0.02988500 0.00276810 10.796 < 0.0000000000000002 ***
## median_age -1.12325830 0.04174571 -26.907 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.3 on 7779 degrees of freedom
## (52 observations deleted due to missingness)
## Multiple R-squared: 0.3846, Adjusted R-squared: 0.3838
## F-statistic: 486.2 on 10 and 7779 DF, p-value: < 0.00000000000000022
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = alp_2pp_lm_demog)
##
## Value p-value Decision
## Global Stat 514.92 0.0000000000000000000 Assumptions NOT satisfied!
## Skewness 32.18 0.0000000140185580921 Assumptions NOT satisfied!
## Kurtosis 377.92 0.0000000000000000000 Assumptions NOT satisfied!
## Link Function 65.68 0.0000000000000005551 Assumptions NOT satisfied!
## Heteroscedasticity 39.14 0.0000000003949938154 Assumptions NOT satisfied!
We know that income is important in the non-rural areas, so it might be worth adding an interaction between rural and non-rural and income.
polling_place_2pp_clean <- polling_place_2pp_clean %>%
mutate(NonRural_Demographic = if_else(Demographic == "Rural", 1, 0),
ALP_Percent_2013 = ALP_Percent + Swing)
alp_2pp_lm_demog <-
lm(ALP_Percent ~ median_household_income + Demographic + percent_indig +
percent_defacto + percent_born_in_australia +
percent_rent + median_weekly_rent + median_age +
NonRural_Demographic:median_household_income,
data = polling_place_2pp_clean)
summary(alp_2pp_lm_demog)
##
## Call:
## lm(formula = ALP_Percent ~ median_household_income + Demographic +
## percent_indig + percent_defacto + percent_born_in_australia +
## percent_rent + median_weekly_rent + median_age + NonRural_Demographic:median_household_income,
## data = polling_place_2pp_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.412 -8.045 -0.263 7.812 73.408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 117.94422059 2.62733142 44.891 < 0.0000000000000002 ***
## median_household_income -0.00051275 0.00001434 -35.751 < 0.0000000000000002 ***
## DemographicOuter Metropolitan -1.91629900 0.45761011 -4.188 0.0000285000653248 ***
## DemographicProvincial -4.78416304 0.63940377 -7.482 0.0000000000000811 ***
## DemographicRural -35.02665851 1.66019796 -21.098 < 0.0000000000000002 ***
## percent_indig 15.73666896 2.74578033 5.731 0.0000000103434805 ***
## percent_defacto 149.07958037 7.74063935 19.259 < 0.0000000000000002 ***
## percent_born_in_australia -8.31193372 2.12750867 -3.907 0.0000942855057573 ***
## percent_rent -13.02413620 2.42071067 -5.380 0.0000000765292289 ***
## median_weekly_rent 0.03336423 0.00274866 12.138 < 0.0000000000000002 ***
## median_age -0.91972431 0.04394585 -20.929 < 0.0000000000000002 ***
## median_household_income:NonRural_Demographic 0.00029327 0.00002176 13.477 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.16 on 7778 degrees of freedom
## (52 observations deleted due to missingness)
## Multiple R-squared: 0.3987, Adjusted R-squared: 0.3978
## F-statistic: 468.8 on 11 and 7778 DF, p-value: < 0.00000000000000022
anova(alp_2pp_lm_demog)
plot(alp_2pp_lm_demog)
vif(alp_2pp_lm_demog)
## GVIF Df GVIF^(1/(2*Df))
## median_household_income 4.812683 1 2.193783
## Demographic 62.371236 3 1.991425
## percent_indig 1.685419 1 1.298237
## percent_defacto 1.527321 1 1.235848
## percent_born_in_australia 3.804766 1 1.950581
## percent_rent 3.065885 1 1.750967
## median_weekly_rent 4.865961 1 2.205892
## median_age 3.077376 1 1.754245
## median_household_income:NonRural_Demographic 23.061591 1 4.802249
gv_alp_2pp_lm_demog <- gvlma(alp_2pp_lm_demog)
summary(gv_alp_2pp_lm_demog)
##
## Call:
## lm(formula = ALP_Percent ~ median_household_income + Demographic +
## percent_indig + percent_defacto + percent_born_in_australia +
## percent_rent + median_weekly_rent + median_age + NonRural_Demographic:median_household_income,
## data = polling_place_2pp_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.412 -8.045 -0.263 7.812 73.408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 117.94422059 2.62733142 44.891 < 0.0000000000000002 ***
## median_household_income -0.00051275 0.00001434 -35.751 < 0.0000000000000002 ***
## DemographicOuter Metropolitan -1.91629900 0.45761011 -4.188 0.0000285000653248 ***
## DemographicProvincial -4.78416304 0.63940377 -7.482 0.0000000000000811 ***
## DemographicRural -35.02665851 1.66019796 -21.098 < 0.0000000000000002 ***
## percent_indig 15.73666896 2.74578033 5.731 0.0000000103434805 ***
## percent_defacto 149.07958037 7.74063935 19.259 < 0.0000000000000002 ***
## percent_born_in_australia -8.31193372 2.12750867 -3.907 0.0000942855057573 ***
## percent_rent -13.02413620 2.42071067 -5.380 0.0000000765292289 ***
## median_weekly_rent 0.03336423 0.00274866 12.138 < 0.0000000000000002 ***
## median_age -0.91972431 0.04394585 -20.929 < 0.0000000000000002 ***
## median_household_income:NonRural_Demographic 0.00029327 0.00002176 13.477 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.16 on 7778 degrees of freedom
## (52 observations deleted due to missingness)
## Multiple R-squared: 0.3987, Adjusted R-squared: 0.3978
## F-statistic: 468.8 on 11 and 7778 DF, p-value: < 0.00000000000000022
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = alp_2pp_lm_demog)
##
## Value p-value Decision
## Global Stat 340.87 0.0000000000 Assumptions NOT satisfied!
## Skewness 25.94 0.0000003526 Assumptions NOT satisfied!
## Kurtosis 277.29 0.0000000000 Assumptions NOT satisfied!
## Link Function 11.62 0.0006526462 Assumptions NOT satisfied!
## Heteroscedasticity 26.03 0.0000003362 Assumptions NOT satisfied!
Using a lasso regression,
polling_place_2pp_clean_na <- na.omit(polling_place_2pp_clean) %>%
dplyr::select(ALP_Percent, StateAb, median_age, median_household_income,
average_household_size, persons_per_bedroom, median_weekly_rent,
median_annual_mortgage, percent_female, percent_defacto,
percent_married, percent_indig, percent_born_in_australia,
percent_unit, percent_mortgage, percent_rent, Demographic)
# Inspect the data
sample_n(polling_place_2pp_clean_na, 3)
# Split the data into training and test set
set.seed(123)
training_samples <- polling_place_2pp_clean_na$ALP_Percent %>%
createDataPartition(p = 0.8, list = FALSE)
train_data <- polling_place_2pp_clean_na[training_samples, ]
test_data <- polling_place_2pp_clean_na[-training_samples, ]
# Dummy code categorical predictor variables
x <- model.matrix(ALP_Percent~., train_data)[, -1]
# Convert the outcome (class) to a numerical variable
y <- train_data$ALP_Percent
cv_lasso <- cv.glmnet(x, y, alpha = 1, family = "gaussian")
# Fit the final model on the training data
alp_2pp_lasso_demog <- glmnet(x, y, alpha = 1, family = "gaussian",
lambda = cv_lasso$lambda.min)
# Display regression coefficients
coef(alp_2pp_lasso_demog)
## 25 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 15.0684175489
## StateAbNSW -10.5408695179
## StateAbNT -7.8046459078
## StateAbQLD -16.3022362445
## StateAbSA -12.4863080849
## StateAbTAS -7.5333433685
## StateAbVIC -13.1650588102
## StateAbWA -20.0779651971
## median_age 0.3361809822
## median_household_income -0.0002954546
## average_household_size 6.4905287857
## persons_per_bedroom 35.6780738148
## median_weekly_rent 0.0139127938
## median_annual_mortgage 0.0001118316
## percent_female 35.7858791409
## percent_defacto 171.9936601055
## percent_married -81.0160731481
## percent_indig -42.2185144180
## percent_born_in_australia -23.5404498825
## percent_unit -48.7470498762
## percent_mortgage 46.8260925422
## percent_rent 45.3005289589
## DemographicOuter Metropolitan -3.1254361561
## DemographicProvincial -3.2690834206
## DemographicRural -13.6374572080
summary(alp_2pp_lasso_demog)
## Length Class Mode
## a0 1 -none- numeric
## beta 24 dgCMatrix S4
## df 1 -none- numeric
## dim 2 -none- numeric
## lambda 1 -none- numeric
## dev.ratio 1 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 6 -none- call
## nobs 1 -none- numeric
alp_2pp_lasso_demog$dev.ratio
## [1] 0.5311051
# Make predictions on the test data
x_test <- model.matrix(ALP_Percent ~., test_data)[, -1]
predictions <- alp_2pp_lasso_demog %>% predict(newx = x_test)
# Model accuracy
observed <- test_data$ALP_Percent
plot(predictions, observed)
cor(predictions, observed)
## [,1]
## s0 0.7230284
cor(predictions, observed)^2
## [,1]
## s0 0.5227701
Metrics::rmse(actual = observed,
predicted = predictions)
## [1] 10.68348
What if we include the 2pp from the last election. It appears that Swing is defined as the swing to the Coalition between 2013 and 2016. So if we add the swing to the 2016 2pp, then we obtain the 2013 2pp.
When using a Linear Model, we can get an R^2 of about 84%
polling_place_2pp_clean <- polling_place_2pp_clean %>%
mutate(logit_ALP_Percent =
log((ALP_Percent / 100) / (1 - ALP_Percent / 100)),
logit_ALP_Percent_2013 =
log((ALP_Percent_2013 / 100) / (1 - ALP_Percent_2013 / 100))) %>%
filter(!is.nan(logit_ALP_Percent)) %>%
filter(!is.na(logit_ALP_Percent)) %>%
filter(!is.infinite(logit_ALP_Percent)) %>%
filter(!is.nan(logit_ALP_Percent_2013)) %>%
filter(!is.na(logit_ALP_Percent_2013)) %>%
filter(!is.infinite(logit_ALP_Percent_2013))
alp_2pp_lm_demog_lag <-
lm(logit_ALP_Percent ~ percent_indig + percent_defacto +
percent_rent + median_weekly_rent + median_age +
NonRural_Demographic * median_household_income + logit_ALP_Percent_2013,
data = polling_place_2pp_clean)
summary(alp_2pp_lm_demog_lag)
##
## Call:
## lm(formula = logit_ALP_Percent ~ percent_indig + percent_defacto +
## percent_rent + median_weekly_rent + median_age + NonRural_Demographic *
## median_household_income + logit_ALP_Percent_2013, data = polling_place_2pp_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1110 -0.1297 -0.0042 0.1305 3.1633
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0959091994 0.0517826740 21.164 < 0.0000000000000002 ***
## percent_indig 0.4115344175 0.0603756158 6.816 0.000000000010048187 ***
## percent_defacto 0.5801063962 0.1692442371 3.428 0.000612 ***
## percent_rent -0.3883763689 0.0480686113 -8.080 0.000000000000000748 ***
## median_weekly_rent 0.0002499617 0.0000599165 4.172 0.000030546066445595 ***
## median_age -0.0147752213 0.0009830689 -15.030 < 0.0000000000000002 ***
## NonRural_Demographic -0.2116056651 0.0346441295 -6.108 0.000000001057798155 ***
## median_household_income -0.0000051779 0.0000003219 -16.086 < 0.0000000000000002 ***
## logit_ALP_Percent_2013 0.8974069831 0.0060571128 148.158 < 0.0000000000000002 ***
## NonRural_Demographic:median_household_income 0.0000020939 0.0000004902 4.272 0.000019646216040251 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2802 on 7733 degrees of freedom
## (52 observations deleted due to missingness)
## Multiple R-squared: 0.8392, Adjusted R-squared: 0.839
## F-statistic: 4485 on 9 and 7733 DF, p-value: < 0.00000000000000022
anova(alp_2pp_lm_demog_lag)
plot(alp_2pp_lm_demog_lag)
vif(alp_2pp_lm_demog_lag)
## percent_indig percent_defacto percent_rent median_weekly_rent median_age NonRural_Demographic median_household_income logit_ALP_Percent_2013 NonRural_Demographic:median_household_income
## 1.502748 1.366026 2.262058 4.328613 2.884030 27.998884 4.532365 1.598839 21.897932
gv_alp_2pp_lm_demog_lag <- gvlma(alp_2pp_lm_demog_lag)
summary(alp_2pp_lm_demog_lag)
##
## Call:
## lm(formula = logit_ALP_Percent ~ percent_indig + percent_defacto +
## percent_rent + median_weekly_rent + median_age + NonRural_Demographic *
## median_household_income + logit_ALP_Percent_2013, data = polling_place_2pp_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1110 -0.1297 -0.0042 0.1305 3.1633
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0959091994 0.0517826740 21.164 < 0.0000000000000002 ***
## percent_indig 0.4115344175 0.0603756158 6.816 0.000000000010048187 ***
## percent_defacto 0.5801063962 0.1692442371 3.428 0.000612 ***
## percent_rent -0.3883763689 0.0480686113 -8.080 0.000000000000000748 ***
## median_weekly_rent 0.0002499617 0.0000599165 4.172 0.000030546066445595 ***
## median_age -0.0147752213 0.0009830689 -15.030 < 0.0000000000000002 ***
## NonRural_Demographic -0.2116056651 0.0346441295 -6.108 0.000000001057798155 ***
## median_household_income -0.0000051779 0.0000003219 -16.086 < 0.0000000000000002 ***
## logit_ALP_Percent_2013 0.8974069831 0.0060571128 148.158 < 0.0000000000000002 ***
## NonRural_Demographic:median_household_income 0.0000020939 0.0000004902 4.272 0.000019646216040251 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2802 on 7733 degrees of freedom
## (52 observations deleted due to missingness)
## Multiple R-squared: 0.8392, Adjusted R-squared: 0.839
## F-statistic: 4485 on 9 and 7733 DF, p-value: < 0.00000000000000022
It would be interesting to see whether 2013 alone is a good predictor. Fitting this model gives an R^2 of 82.7%. This means that the other variables add a bit, but not a huge amount.
alp_2pp_lm_demog_lag <-
lm(logit_ALP_Percent ~ logit_ALP_Percent_2013,
data = polling_place_2pp_clean)
summary(alp_2pp_lm_demog_lag)
##
## Call:
## lm(formula = logit_ALP_Percent ~ logit_ALP_Percent_2013, data = polling_place_2pp_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2357 -0.1441 -0.0115 0.1335 3.2092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.127440 0.003472 36.7 <0.0000000000000002 ***
## logit_ALP_Percent_2013 0.954281 0.004944 193.0 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2902 on 7793 degrees of freedom
## Multiple R-squared: 0.827, Adjusted R-squared: 0.827
## F-statistic: 3.725e+04 on 1 and 7793 DF, p-value: < 0.00000000000000022
anova(alp_2pp_lm_demog_lag)
plot(alp_2pp_lm_demog_lag)
There are three outlying values - let"s explore these
polling_place_2pp_clean[c(2079, 7267, 2758), ]
These booths have fewer than 40 electors, and fewer than 5 for one of the two parties. They are also “non-standard” booths.
It might be interesting to see which booths deviate from the division mean.
Can we fit a gradient boosting machine to improve fit (while trading off raw explainability)
polling_place_2pp_gbm <- polling_place_2pp_clean %>%
clean_names() %>%
dplyr::select(-year, -division_nm, -polling_place_id, -polling_place,
-division, -premises_post_code, -division_id, -state,
-enrolment, -successful_party, -seat_status, -alp_percent,
-lnp_percent, -lnp_votes, -alp_votes, -total_votes, -swing) %>%
mutate(state_ab = factor(state_ab),
demographic = factor(demographic),
previous_party = factor(previous_party),
previous_seat_status = factor(previous_seat_status))
polling_place_2pp_split <- initial_split(polling_place_2pp_gbm, prop = .8)
polling_place_2pp_train <- training(polling_place_2pp_split)
polling_place_2pp_test <- testing(polling_place_2pp_split)
polling_place_2pp_gbm1 <- gbm::gbm(logit_alp_percent ~.,
data = polling_place_2pp_train,
distribution = "gaussian",
verbose = F,
shrinkage = 0.1,
interaction.depth = 2,
n.minobsinnode = 10,
n.trees = 5000,
cv.folds = 10)
perf_gbm1 <- gbm.perf(polling_place_2pp_gbm1, method = "cv")
polling_place_2pp_prediction_1 <- stats::predict(
object = polling_place_2pp_gbm1,
newdata = polling_place_2pp_test,
n.trees = perf_gbm1)
rmse_fit1 <- Metrics::rmse(actual = polling_place_2pp_test$logit_alp_percent,
predicted = polling_place_2pp_prediction_1)
print(rmse_fit1)
## [1] 0.2518863
data.frame(actual = polling_place_2pp_test$logit_alp_percent,
predicted = polling_place_2pp_prediction_1) %>%
ggplot(aes(x = predicted, y = actual)) + geom_point() +
geom_abline(slope = 1, intercept = 0, colour = "blue") +
theme_classic()
gbm::plot.gbm(polling_place_2pp_gbm1, i.var = 8)
When looking at variable importance, it is clear that the most important factor is the 2pp vote in the previous election.
polling_place_2pp_effects <-
tibble::as_tibble(gbm::summary.gbm(polling_place_2pp_gbm1, plotit = FALSE))
polling_place_2pp_effects %>% utils::head(n = 15)
polling_place_2pp_effects %>%
# arrange descending to get the top influencers
dplyr::arrange(desc(rel.inf)) %>%
# sort to top 10
dplyr::top_n(10) %>%
# plot these data using columns
ggplot(aes(x = forcats::fct_reorder(.f = var,
.x = rel.inf),
y = rel.inf,
fill = rel.inf)) +
geom_col() +
# flip
coord_flip() +
# format
scale_color_brewer(palette = "Dark2") +
theme(axis.title = element_text()) +
theme_classic() +
xlab("Features") +
ylab("Relative Influence") +
ggtitle("Top 10 Drivers of 2pp vote")
We can use the skim function to summarise the covariates and outcome variables that we will be modelling on.
polling_place_2pp_model <- polling_place_2pp_clean %>%
clean_names() %>%
dplyr::select(-year, -division_nm, -polling_place_id, -polling_place,
-division, -premises_post_code, -division_id, -state,
-enrolment, -successful_party, -seat_status, -alp_percent,
-lnp_percent, -lnp_votes, -alp_votes, -total_votes, -swing,
-alp_percent_2013) %>%
mutate(state_ab = factor(state_ab),
demographic = factor(demographic),
previous_party = factor(previous_party),
previous_seat_status = factor(previous_seat_status))
skim(polling_place_2pp_model)
| Name | polling_place_2pp_model |
| Number of rows | 7795 |
| Number of columns | 23 |
| _______________________ | |
| Column type frequency: | |
| factor | 4 |
| numeric | 19 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| state_ab | 0 | 1 | FALSE | 8 | NSW: 2670, VIC: 1740, QLD: 1535, SA: 693 |
| demographic | 0 | 1 | FALSE | 4 | Rur: 2975, Inn: 1944, Out: 1767, Pro: 1109 |
| previous_party | 0 | 1 | FALSE | 9 | Lib: 3092, Aus: 2394, Lib: 1120, The: 839 |
| previous_seat_status | 0 | 1 | FALSE | 3 | Saf: 3276, Mar: 3086, Fai: 1433 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| median_age | 0 | 1.00 | 40.27 | 5.50 | 22.92 | 36.21 | 39.91 | 43.85 | 61.78 | <U+2581><U+2586><U+2587><U+2582><U+2581> |
| median_household_income | 0 | 1.00 | 74774.72 | 21123.80 | 39294.34 | 58286.72 | 70629.04 | 87967.76 | 154667.68 | <U+2587><U+2587><U+2585><U+2582><U+2581> |
| average_household_size | 0 | 1.00 | 2.57 | 0.30 | 1.63 | 2.36 | 2.51 | 2.74 | 5.32 | <U+2583><U+2587><U+2581><U+2581><U+2581> |
| persons_per_bedroom | 0 | 1.00 | 0.83 | 0.11 | 0.70 | 0.80 | 0.80 | 0.89 | 1.98 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| median_weekly_rent | 0 | 1.00 | 318.18 | 110.38 | 16.22 | 242.37 | 320.73 | 382.52 | 785.86 | <U+2582><U+2587><U+2587><U+2581><U+2581> |
| median_annual_mortgage | 0 | 1.00 | 20827.00 | 5666.31 | 265.98 | 16522.31 | 20809.12 | 24219.06 | 41241.53 | <U+2581><U+2583><U+2587><U+2583><U+2581> |
| percent_female | 0 | 1.00 | 0.50 | 0.02 | 0.32 | 0.50 | 0.51 | 0.51 | 0.54 | <U+2581><U+2581><U+2581><U+2582><U+2587> |
| percent_defacto | 0 | 1.00 | 0.08 | 0.02 | 0.02 | 0.06 | 0.08 | 0.09 | 0.25 | <U+2583><U+2587><U+2581><U+2581><U+2581> |
| percent_married | 0 | 1.00 | 0.35 | 0.05 | 0.06 | 0.32 | 0.35 | 0.38 | 0.51 | <U+2581><U+2581><U+2583><U+2587><U+2581> |
| percent_indig | 0 | 1.00 | 0.03 | 0.06 | 0.00 | 0.01 | 0.02 | 0.04 | 0.95 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| percent_born_in_australia | 0 | 1.00 | 0.71 | 0.13 | 0.19 | 0.62 | 0.74 | 0.81 | 0.99 | <U+2581><U+2582><U+2583><U+2587><U+2582> |
| percent_unit | 52 | 0.99 | 0.09 | 0.14 | 0.00 | 0.01 | 0.03 | 0.10 | 0.80 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| percent_mortgage | 52 | 0.99 | 0.32 | 0.09 | 0.00 | 0.26 | 0.31 | 0.37 | 0.69 | <U+2581><U+2585><U+2587><U+2582><U+2581> |
| percent_rent | 52 | 0.99 | 0.27 | 0.10 | 0.05 | 0.20 | 0.26 | 0.32 | 0.89 | <U+2585><U+2587><U+2582><U+2581><U+2581> |
| latitude | 33 | 1.00 | -32.61 | 6.86 | -43.42 | -36.46 | -33.86 | -30.51 | 57.93 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
| longitude | 33 | 1.00 | 144.45 | 15.14 | -95.71 | 144.66 | 147.84 | 151.21 | 167.95 | <U+2581><U+2581><U+2581><U+2581><U+2587> |
| non_rural_demographic | 0 | 1.00 | 0.38 | 0.49 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | <U+2587><U+2581><U+2581><U+2581><U+2585> |
| logit_alp_percent | 0 | 1.00 | -0.09 | 0.70 | -3.77 | -0.52 | -0.10 | 0.37 | 3.33 | <U+2581><U+2581><U+2587><U+2582><U+2581> |
| logit_alp_percent_2013 | 0 | 1.00 | -0.23 | 0.66 | -4.03 | -0.64 | -0.22 | 0.20 | 2.71 | <U+2581><U+2581><U+2587><U+2585><U+2581> |
We can see that there is some missing data for latitude, longitude, and dwelling statistics. We will impute these using KNN imputation.
polling_place_2pp_split <- initial_split(polling_place_2pp_model, prop = .8)
polling_place_2pp_train <- training(polling_place_2pp_split)
polling_place_2pp_test <- testing(polling_place_2pp_split)
training_folds <- vfold_cv(polling_place_2pp_train, v = 10)
polling_place_2pp_rec <- recipe(logit_alp_percent ~ .,
data = polling_place_2pp_train) %>%
step_knnimpute(all_predictors()) %>%
step_dummy(all_predictors(), -all_numeric())
metric <- metric_set(rmse)
Create some candidate models
ctrl_grid <- control_stack_grid()
ctrl_res <- control_stack_resamples()
lm_spec <-
linear_reg() %>%
set_engine("lm")
lm_wflow <-
workflow() %>%
add_model(lm_spec) %>%
add_recipe(polling_place_2pp_rec)
lm_resample <-
fit_resamples(
lm_wflow,
resamples = training_folds,
metrics = metric,
control = ctrl_res
)
lasso_spec <-
linear_reg(penalty = tune("pen"), mixture = tune("mix")) %>%
set_engine("glmnet")
lasso_wflow <-
workflow() %>%
add_model(lasso_spec) %>%
add_recipe(polling_place_2pp_rec)
#Fix
lasso_resample <-
tune_grid(
lasso_wflow,
resamples = training_folds,
metrics = metric,
grid = 6,
control = ctrl_res
)
random_forest_spec <-
rand_forest(mode = "regression", mtry = tune("num_pred"),
min_n = tune("min_n")) %>%
set_engine("ranger")
random_forest_wflow <-
workflow() %>%
add_model(random_forest_spec) %>%
add_recipe(polling_place_2pp_rec)
#Fix
random_forest_resample <-
tune_grid(
random_forest_wflow,
resamples = training_folds,
metrics = metric,
grid = 6,
control = ctrl_res
)
knn_spec <-
nearest_neighbor(
mode = "regression",
neighbors = tune("k")
) %>%
set_engine("kknn")
knn_wflow <-
workflow() %>%
add_model(knn_spec) %>%
add_recipe(polling_place_2pp_rec)
#Fix
knn_resample <-
tune_grid(
knn_wflow,
resamples = training_folds,
metrics = metric,
grid = 6,
control = ctrl_res
)
gbm_spec <-
boost_tree(
min_n = tune("min_n"),
tree_depth = tune("tree_depth"),
learn_rate = tune("learn_rate"),
loss_reduction = tune("loss_reduction")
) %>%
set_engine("xgboost") %>%
set_mode("regression")
gbm_wflow <-
workflow() %>%
add_model(gbm_spec) %>%
add_recipe(polling_place_2pp_rec)
#Fix
gbm_resample <-
tune_grid(
gbm_wflow,
resamples = training_folds,
metrics = metric,
grid = 80,
control = ctrl_res
)
svm_spec <-
svm_rbf(
cost = tune("cost"),
rbf_sigma = tune("sigma")
) %>%
set_engine("kernlab") %>%
set_mode("regression")
svm_wflow <-
workflow() %>%
add_model(svm_spec) %>%
add_recipe(polling_place_2pp_rec)
#Fix
svm_resample <-
tune_grid(
svm_wflow,
resamples = training_folds,
metrics = metric,
grid = 6,
control = ctrl_res
)
polling_place_2pp_stack <-
stacks() %>%
add_candidates(lm_resample) %>%
add_candidates(lasso_resample) %>%
add_candidates(random_forest_resample) %>%
add_candidates(knn_resample) %>%
add_candidates(gbm_resample) %>%
add_candidates(svm_resample)
## Warning: Predictions from the candidates c("gbm_resample_1_62", "gbm_resample_1_02", "gbm_resample_1_69", "gbm_resample_1_44", "gbm_resample_1_79", "gbm_resample_1_33", "gbm_resample_1_80", "gbm_resample_1_71", "gbm_resample_1_27", "gbm_resample_1_61", "gbm_resample_1_35", "gbm_resample_1_32", "gbm_resample_1_55", "gbm_resample_1_08", "gbm_resample_1_21") were identical to those from existing candidates and were removed from the data stack.
polling_place_2pp_stack <-
polling_place_2pp_stack %>%
blend_predictions()
theme_set(theme_classic(base_size = 16))
autoplot(polling_place_2pp_stack)
autoplot(polling_place_2pp_stack, type = "weights")
polling_place_2pp_stack <-
polling_place_2pp_stack %>%
fit_members()
collect_parameters(polling_place_2pp_stack, "lasso_resample")
collect_parameters(polling_place_2pp_stack, "knn_resample")
collect_parameters(polling_place_2pp_stack, "random_forest_resample")
collect_parameters(polling_place_2pp_stack, "gbm_resample")
collect_parameters(polling_place_2pp_stack, "svm_resample")
polling_place_2pp_test <-
polling_place_2pp_test %>%
bind_cols(predict(polling_place_2pp_stack, .))
## Warning in predict.lm(object = object$fit, newdata = new_data, type = "response"): prediction from a rank-deficient fit may be misleading
ggplot(polling_place_2pp_test) +
aes(x = logit_alp_percent,
y = .pred) +
geom_point() +
coord_obs_pred()
member_preds <-
polling_place_2pp_test %>%
dplyr::select(logit_alp_percent) %>%
bind_cols(predict(polling_place_2pp_stack, polling_place_2pp_test,
members = TRUE))
## Warning in predict.lm(object = object$fit, newdata = new_data, type = "response"): prediction from a rank-deficient fit may be misleading
map_dfr(member_preds, rmse, truth = logit_alp_percent, data = member_preds) %>%
mutate(member = colnames(member_preds))
map_dfr(member_preds, rsq, truth = logit_alp_percent, data = member_preds) %>%
mutate(member = colnames(member_preds))
Interestingly, an ensemble only gives us a 1.5% increase in R^2 over a linear model, with the best random forests outperforming linear regression by about 1%